In [117]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from __future__ import print_function
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

%matplotlib inline
%config InlineBackend.figure_format = 'png'
pd.set_option("max_columns",50)

In [2]:


In [3]:
%%time
df = pd.read_csv("../data/train_2013.csv", index_col=0)


Wall time: 39 s

In [4]:
df["date_time"] = pd.to_datetime(df["date_time"], errors="coerce")

In [6]:
# %%time
# skip_col = ["date_time","orig_destination_distance"]
# for col in df_1.columns:
#     if col == skip_col:
#         pass
#     print(col, np.unique(df_1[col].astype(str)))

In [35]:
# check in / check out / distance  => nan값 존재

In [9]:
%%time
df = df.reset_index(drop=True)
# 10000명의 데이터만 사용
df = df.ix[:9999]


Wall time: 1.87 s

In [11]:
df.to_csv("train_2013_10000.csv")

In [36]:
df.columns


Out[36]:
Index([u'date_time', u'site_name', u'posa_continent', u'user_location_country',
       u'user_location_region', u'user_location_city',
       u'orig_destination_distance', u'user_id', u'is_mobile', u'is_package',
       u'channel', u'srch_ci', u'srch_co', u'srch_adults_cnt',
       u'srch_children_cnt', u'srch_rm_cnt', u'srch_destination_id',
       u'srch_destination_type_id', u'is_booking', u'cnt', u'hotel_continent',
       u'hotel_country', u'hotel_market', u'hotel_cluster'],
      dtype='object')

In [37]:
cols = df.columns.tolist()[-6:] + df.columns.tolist()[:-6]
df = df[cols]

In [46]:
cols = df.columns.tolist()[:1] + df.columns.tolist()[6:] + df.columns.tolist()[1:6] 
df = df[cols]

In [48]:
# 제거할 feature 생각해보기

In [53]:
df.head()


Out[53]:
is_booking date_time site_name posa_continent user_location_country user_location_region user_location_city orig_destination_distance user_id is_mobile is_package channel srch_ci srch_co srch_adults_cnt srch_children_cnt srch_rm_cnt srch_destination_id srch_destination_type_id cnt hotel_continent hotel_country hotel_market hotel_cluster
0 0 2013-06-15 15:10:49 30 4 195 548 56440 NaN 1048 0 1 9 2013-09-07 2013-09-15 2 0 1 1385 1 1 0 185 185 58
1 1 2013-06-15 15:38:05 30 4 195 548 56440 NaN 1048 0 1 9 2013-09-06 2013-09-14 2 0 1 1385 1 1 0 185 185 58
2 0 2013-02-15 13:18:43 2 3 66 462 41898 2716.6746 1482 0 0 1 2013-02-24 2013-03-01 2 0 1 8857 1 1 2 50 214 28
3 0 2013-02-16 11:57:50 2 3 66 462 41898 2716.5257 1482 0 0 0 2013-02-24 2013-03-01 2 0 1 8857 1 1 2 50 214 73
4 0 2013-02-16 12:03:45 2 3 66 462 41898 2722.4856 1482 0 0 0 2013-02-24 2013-03-01 2 0 1 8857 1 1 2 50 214 26

In [49]:
df.columns
#


Out[49]:
Index([u'is_booking', u'date_time', u'site_name', u'posa_continent',
       u'user_location_country', u'user_location_region',
       u'user_location_city', u'orig_destination_distance', u'user_id',
       u'is_mobile', u'is_package', u'channel', u'srch_ci', u'srch_co',
       u'srch_adults_cnt', u'srch_children_cnt', u'srch_rm_cnt',
       u'srch_destination_id', u'srch_destination_type_id', u'cnt',
       u'hotel_continent', u'hotel_country', u'hotel_market',
       u'hotel_cluster'],
      dtype='object')

In [68]:
delete_list = ["user_location_city", "user_location_region","is_mobile","is_package","hotel_country","hotel_market"]

In [70]:
df = df.drop(delete_list, axis=1)

In [75]:
print(df.columns, len(df.columns))


Index([u'is_booking', u'date_time', u'site_name', u'posa_continent',
       u'user_location_country', u'orig_destination_distance', u'user_id',
       u'channel', u'srch_ci', u'srch_co', u'srch_adults_cnt',
       u'srch_children_cnt', u'srch_rm_cnt', u'srch_destination_id',
       u'srch_destination_type_id', u'cnt', u'hotel_continent',
       u'hotel_cluster'],
      dtype='object') 18

In [84]:
df = df.drop(["posa_continent","orig_destination_distance", "srch_destination_type_id"], axis=1)

In [83]:
for col in df.columns:
    if col == "date_time":
        continue
    print(df[col].value_counts())
# df["posa_continent"].value_counts()


0    8973
1    1027
Name: is_booking, dtype: int64
2     6498
37    1609
11     495
34     429
24     293
13     190
17     132
8       60
9       53
32      46
25      42
22      42
40      34
33      25
30      17
35      10
28       8
26       7
18       4
10       3
19       3
Name: site_name, dtype: int64
3    7459
1    2038
2     416
4      77
0      10
Name: posa_continent, dtype: int64
66     5960
69     1429
205     860
3       237
46      227
133     184
68      116
225      93
80       72
77       63
70       62
32       53
28       46
62       45
215      44
23       39
231      38
0        37
235      35
154      34
239      33
198      31
85       23
93       22
195      19
29       19
158      18
194      17
182      17
142      14
48       13
148      12
168       9
115       7
27        6
12        6
162       6
51        5
103       5
208       5
1         5
39        5
82        5
119       4
54        3
64        3
157       3
202       3
5         2
57        2
190       2
166       2
Name: user_location_country, dtype: int64
5615.1972    25
202.0282     18
5797.7663    13
99.3290      13
96.6554      12
63.1618      10
87.9549      10
1590.7968     9
1868.9003     9
444.0290      9
2381.9139     9
295.6522      9
394.6765      8
318.2518      8
1235.6843     8
188.5710      8
51.6725       7
4527.2234     7
192.3717      7
6595.6508     6
79.0066       6
1917.9193     6
4527.4575     6
4491.8270     6
16.9159       6
1657.3570     6
1036.2993     6
142.9752      6
5095.1241     6
1622.4278     6
             ..
2528.9976     1
2226.1895     1
62.0251       1
139.6310      1
267.9601      1
2298.5065     1
1265.1169     1
2407.0524     1
734.3049      1
229.9292      1
1578.9702     1
218.9363      1
34.6006       1
407.4950      1
3351.0581     1
625.4276      1
1236.5718     1
1778.1646     1
4314.3084     1
1975.4526     1
192.7684      1
191.8491      1
258.0471      1
27.6370       1
157.6479      1
5809.3784     1
8407.6806     1
16.7685       1
770.8005      1
369.4375      1
Name: orig_destination_distance, dtype: int64
70535     345
33803     228
94390     212
71855     166
69003     156
121433    154
50191     152
123225    142
122669    115
81357     110
85275      98
108285     96
76943      92
70340      91
78474      86
34019      85
90864      84
9616       77
72708      76
125389     72
82160      71
115418     69
134677     69
38878      67
106813     65
117339     65
112433     65
118142     64
41165      64
88429      63
         ... 
111660      1
99735       1
54981       1
88681       1
18287       1
131587      1
82601       1
7523        1
134530      1
78546       1
101223      1
64523       1
113534      1
87044       1
17692       1
93505       1
85175       1
54670       1
70929       1
76809       1
97827       1
70663       1
78773       1
77811       1
97028       1
6300        1
98150       1
75637       1
32458       1
88198       1
Name: user_id, dtype: int64
9     5800
0     1273
1     1030
2      688
3      470
5      396
4      260
7       55
6       15
8       12
10       1
Name: channel, dtype: int64
2013-05-04    131
2013-10-18     94
2013-10-05     80
2013-12-29     80
2013-05-07     76
2013-07-03     71
2013-09-13     69
2013-03-21     66
2013-08-15     64
2013-08-11     64
2013-11-30     63
2013-08-09     62
2013-08-29     62
2013-08-17     59
2013-08-30     58
2013-03-29     58
2013-10-25     58
2013-10-04     57
2013-09-10     57
2013-08-24     56
2013-07-05     54
2013-10-17     54
2013-08-13     53
2013-08-16     53
2013-10-26     52
2013-05-08     51
2013-07-24     50
2013-03-07     49
2013-03-14     49
2013-12-27     49
             ... 
2014-07-22      1
2014-03-01      1
2014-05-15      1
2014-06-23      1
2014-05-28      1
2014-05-22      1
2014-04-25      1
2014-07-14      1
2014-08-06      1
2014-03-13      1
2014-03-19      1
2014-05-30      1
2014-05-08      1
2014-08-11      1
2014-04-19      1
2014-07-02      1
2014-03-12      1
2014-04-09      1
2014-06-08      1
2014-05-14      1
2013-01-08      1
2014-05-04      1
2014-12-08      1
2014-09-03      1
2014-01-31      1
2014-05-18      1
2014-05-02      1
2014-03-14      1
2014-03-09      1
2014-06-04      1
Name: srch_ci, dtype: int64
2013-05-08    178
2013-10-20    127
2013-03-24    101
2013-12-31     93
2013-12-01     79
2013-09-15     75
2013-07-12     71
2013-11-09     66
2013-09-02     65
2013-07-07     63
2013-08-23     62
2013-08-21     62
2013-11-27     62
2013-11-28     62
2013-05-03     59
2013-03-16     59
2013-08-04     59
2013-03-07     59
2013-08-26     59
2013-09-22     57
2013-10-11     56
2013-12-29     55
2013-05-10     55
2013-09-14     55
2013-06-07     55
2013-09-26     55
2013-05-05     54
2013-05-09     54
2013-10-12     52
2013-08-09     51
             ... 
2013-07-24      1
2014-02-25      1
2014-10-03      1
2014-02-11      1
2014-08-09      1
2013-08-05      1
2014-04-25      1
2014-04-26      1
2014-12-14      1
2014-07-19      1
2014-06-03      1
2014-05-23      1
2013-01-26      1
2014-06-23      1
2013-02-11      1
2014-03-10      1
2014-06-01      1
2014-05-09      1
2014-06-06      1
2014-03-26      1
2014-06-24      1
2014-04-19      1
2014-07-09      1
2014-05-18      1
2014-09-05      1
2014-03-12      1
2014-06-12      1
2014-12-22      1
2014-06-19      1
2014-03-25      1
Name: srch_co, dtype: int64
2    6208
1    2456
4     707
3     418
5      98
6      58
8      32
0      17
9       4
7       2
Name: srch_adults_cnt, dtype: int64
0    7193
1    1845
2     711
3     233
4       7
5       6
9       5
Name: srch_children_cnt, dtype: int64
1    8955
2     794
3     124
5      41
4      37
8      32
6      12
7       5
Name: srch_rm_cnt, dtype: int64
8267     296
8250     279
8746     194
12206    138
8268     131
8791     108
11439     98
8279      91
8278      89
7635      87
8745      84
8230      83
8220      82
8260      75
44045     70
468       65
8788      62
12257     61
8253      58
8213      57
12264     57
8855      56
8862      52
9147      48
12190     48
8266      47
20225     47
11353     47
12191     46
12227     46
        ... 
3093       1
11977      1
3789       1
12215      1
10320      1
40522      1
28593      1
43240      1
11972      1
4287       1
41137      1
24282      1
22235      1
24298      1
24322      1
45147      1
12363      1
61193      1
20328      1
28585      1
28556      1
12193      1
6043       1
12153      1
6051       1
28481      1
7999       1
20400      1
14138      1
22525      1
Name: srch_destination_id, dtype: int64
1    5608
6    2667
3     894
5     409
4     403
8      19
Name: srch_destination_type_id, dtype: int64
1     7005
2     1653
3      669
4      330
5      149
6       69
7       48
8       23
9       16
11      14
10       6
14       5
13       4
12       3
16       3
15       1
17       1
23       1
Name: cnt, dtype: int64
2    5575
6    2219
4    1088
3     848
5     188
0      82
Name: hotel_continent, dtype: int64
91    306
48    258
41    249
64    216
25    195
42    172
10    171
16    161
95    157
97    153
65    153
50    150
46    144
21    143
68    140
30    137
18    137
47    137
37    132
70    130
83    129
59    128
6     128
98    128
5     127
58    126
9     122
2     119
1     118
72    116
     ... 
15     71
39     69
38     69
31     68
14     67
20     66
19     66
92     64
12     63
67     60
45     60
43     60
51     59
79     57
66     57
60     56
93     53
71     50
49     48
23     48
75     46
35     41
87     40
63     38
88     35
24     33
53     33
80     25
27     18
74      6
Name: hotel_cluster, dtype: int64

In [88]:
# 별다른 Feature Engineering을 하지 않고 제거해서 model을 돌려보자

In [100]:
df["srch_ci"] = pd.to_datetime(df["srch_ci"], errors="coerce")
df["srch_co"] = pd.to_datetime(df["srch_co"], errors="coerce")

In [118]:
le = preprocessing.LabelEncoder()

In [121]:
df["srch_ci"] = le.fit_transform(df["srch_ci"])
df["srch_co"] = le.fit_transform(df["srch_co"])

In [134]:
df["date_time"] = df["date_time"].dt.date
df["date_time"] = le.fit_transform(df["date_time"])

In [135]:
trn_x = df.ix[:,1:]
trn_y = df.ix[:,:1]

In [136]:
model = RandomForestClassifier(max_depth=3, n_jobs=-1, random_state=402)

y 를 잘못 설정해부렸던 case


In [137]:
model.fit(trn_x,trn_y)


C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:1: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
  if __name__ == '__main__':
Out[137]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=402, verbose=0, warm_start=False)

In [153]:
importances = model.feature_importances_

std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(trn_x.shape[1]):
#     print(indices[f])
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], trn_x.columns[indices[f]], importances[indices[f]]))

plt.title("Feature importances")
plt.bar(range(trn_x.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(trn_x.shape[1]), indices)
plt.xlim([-1, trn_x.shape[1]])
plt.show()


Feature ranking:
1. feature 11 cnt (0.533188)
2. feature 6 srch_co (0.102094)
3. feature 7 srch_adults_cnt (0.091235)
4. feature 12 hotel_continent (0.050153)
5. feature 3 user_id (0.042140)
6. feature 4 channel (0.039411)
7. feature 8 srch_children_cnt (0.036964)
8. feature 10 srch_destination_id (0.032244)
9. feature 5 srch_ci (0.030716)
10. feature 9 srch_rm_cnt (0.021721)
11. feature 1 site_name (0.009355)
12. feature 13 hotel_cluster (0.007323)
13. feature 0 date_time (0.003447)
14. feature 2 user_location_country (0.000008)

In [ ]:
# 1. feature 11 cnt (0.533188)
# 2. feature 6 srch_co (0.102094)
# 3. feature 7 srch_adults_cnt (0.091235)
# 4. feature 12 hotel_continent (0.050153)
# 5. feature 3 user_id (0.042140)
# 6. feature 4 channel (0.039411)
# 7. feature 8 srch_children_cnt (0.036964)
# 8. feature 10 srch_destination_id (0.032244)
# 9. feature 5 srch_ci (0.030716)
# 10. feature 9 srch_rm_cnt (0.021721)
# 11. feature 1 site_name (0.009355)
# 12. feature 13 hotel_cluster (0.007323)
# 13. feature 0 date_time (0.003447)
# 14. feature 2 user_location_country (0.000008) => 제거

In [154]:
sub_ex = pd.read_csv("../sample_submission.csv")

In [164]:
sub_ex.head()


Out[164]:
id hotel_cluster
0 0 99 1
1 1 99 1
2 2 99 1
3 3 99 1
4 4 99 1

In [158]:
trn_x.head()


Out[158]:
date_time site_name user_location_country user_id channel srch_ci srch_co srch_adults_cnt srch_children_cnt srch_rm_cnt srch_destination_id cnt hotel_continent hotel_cluster
0 159 30 195 1048 9 239 245 2 0 1 1385 1 0 58
1 159 30 195 1048 9 238 244 2 0 1 1385 1 0 58
2 39 2 66 1482 1 44 47 2 0 1 8857 1 2 28
3 40 2 66 1482 0 44 47 2 0 1 8857 1 2 73
4 40 2 66 1482 0 44 47 2 0 1 8857 1 2 26

In [ ]:
# is_booking이 y라고 생각했는데 다시 생각해보니 hotel_cluster가 중요함

In [163]:
trn_x1 = df.ix[:,:-1]
trn_y1 = df.ix[:,-1:]

model.fit(trn_x1,trn_y1)

importances = model.feature_importances_

std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(trn_x1.shape[1]):
    print("%d. feature %d %s (%f)" % (f + 1, indices[f], trn_x1.columns[indices[f]], importances[indices[f]]))

plt.title("Feature importances")
plt.bar(range(trn_x1.shape[1]), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(trn_x1.shape[1]), indices)
plt.xlim([-1, trn_x1.shape[1]])
plt.show()
# 위로 10000개 잡고, 샘플링, 다시 나오나 보고 변한다면, 데이터가 흔들리는지 확인
# feature 샘플링.


C:\Users\Byeon\Anaconda3\envs\py27\lib\site-packages\ipykernel\__main__.py:4: DataConversionWarning: A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().
Feature ranking:
1. feature 13 hotel_continent (0.364650)
2. feature 11 srch_destination_id (0.172174)
3. feature 2 site_name (0.159535)
4. feature 3 user_location_country (0.102945)
5. feature 4 user_id (0.077250)
6. feature 7 srch_co (0.027816)
7. feature 6 srch_ci (0.019617)
8. feature 10 srch_rm_cnt (0.017608)
9. feature 9 srch_children_cnt (0.017537)
10. feature 1 date_time (0.014128)
11. feature 8 srch_adults_cnt (0.011914)
12. feature 5 channel (0.010902)
13. feature 12 cnt (0.003925)
14. feature 0 is_booking (0.000000)

In [ ]:
# Feature ranking:
# 1. feature 13 hotel_continent (0.364650) 
# 2. feature 11 srch_destination_id (0.172174)
# 3. feature 2 site_name (0.159535)
# 4. feature 3 user_location_country (0.102945)
# 5. feature 4 user_id (0.077250)
# 6. feature 7 srch_co (0.027816)
# 7. feature 6 srch_ci (0.019617)
# 8. feature 10 srch_rm_cnt (0.017608)
# 9. feature 9 srch_children_cnt (0.017537)
# 10. feature 1 date_time (0.014128)
# 11. feature 8 srch_adults_cnt (0.011914)
# 12. feature 5 channel (0.010902)
# 13. feature 12 cnt (0.003925)
# 14. feature 0 is_booking (0.000000)


# co-ci 기간 변수 
# is_booking한 사람의

Feature engineering


In [179]:
df.head()


Out[179]:
is_booking date_time site_name user_location_country user_id channel srch_ci srch_co srch_adults_cnt srch_children_cnt srch_rm_cnt srch_destination_id cnt hotel_continent hotel_cluster
0 0 159 30 195 1048 9 239 245 2 0 1 1385 1 0 58
1 1 159 30 195 1048 9 238 244 2 0 1 1385 1 0 58
2 0 39 2 66 1482 1 44 47 2 0 1 8857 1 2 28
3 0 40 2 66 1482 0 44 47 2 0 1 8857 1 2 73
4 0 40 2 66 1482 0 44 47 2 0 1 8857 1 2 26